{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据集预处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用的数据集是DeepFashion中的`in-shop_clothes_retrieval_benchmark`数据集，在预处理前，需要制作数据集软链接到`../data/input`目录\n",
    "\n",
    "1. DeeFashion子数据集 in-shop 目录结构如下：\n",
    "\n",
    "```shell\n",
    ".\n",
    "├── Anno\n",
    "│   ├── list_attr_cloth.txt            # 衣服所有的属性列表\n",
    "│   ├── list_attr_items.txt            # 每个商品id对应的属性列表\n",
    "│   ├── list_bbox_inshop.txt           # 每张图片对应的bbox\n",
    "│   ├── list_description_inshop.json   # 每个商品id详细信息文字描述\n",
    "│   ├── list_item_inshop.txt           # 所有的商品id列表\n",
    "│   └── list_landmarks_inshop.txt      # 每张图片对应的关键点位置\n",
    "├── Eval\n",
    "│   └── list_eval_partition.txt        # 每张图片的train、val、test标签\n",
    "├── Img\n",
    "│   └── img\n",
    "│       ├── MEN\n",
    "│       │   ├── Denim\n",
    "│       │   │    ├── id_0000xxx\n",
    "│       │   │    ├── ...\n",
    "│       │   │    └── id_0000xxx\n",
    "│       │   ├── ...\n",
    "│       │   └── Tees_Tanks\n",
    "│       └── WOMEN\n",
    "│           ├── Blouses_Shirts\n",
    "│           ├── ...\n",
    "│           └── Tees_Tanks\n",
    "├── README.txt\n",
    "├── test.csv                           # 预处理后的test数据集\n",
    "├── train.csv                          # 预处理后的train数据集\n",
    "└── val.csv                            # 预处理后的val数据集\n",
    "```\n",
    "\n",
    "2. `../data/output/`文件结构如下：\n",
    "\n",
    "```shell\n",
    ".\n",
    "├── cache                              # 缓存文件夹\n",
    "├── logs                               # 日志文件夹\n",
    "│   └── fashionnet                     # 使用的网络名称\n",
    "│       ├── events.out.tfevents.1554190389.mysd-desktop  # 中间结果tensorboard文件\n",
    "│       └── fashionnet.csv             # 中间结果csv文件\n",
    "├── models                             # 模型文件\n",
    "│   ├── fashionnet                     # 使用的网络名称，指定网络保存在该文件夹下\n",
    "│   └── resnet50\n",
    "└── submits                            # 预测结果保存的文件夹\n",
    "```\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## dependency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:34:59.177494Z",
     "start_time": "2019-03-28T09:34:58.049539Z"
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from pathlib import Path\n",
    "from multiprocessing.pool import Pool\n",
    "\n",
    "from PIL import Image\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import imgaug as ia\n",
    "from imgaug import augmenters as iaa\n",
    "from tqdm import tqdm_notebook as tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:34:59.185098Z",
     "start_time": "2019-03-28T09:34:59.180343Z"
    }
   },
   "outputs": [],
   "source": [
    "LANDMARKS_INSHOP = '../data/input/Anno/list_landmarks_inshop.txt'\n",
    "EVAL_PARTITION = '../data/input/Eval/list_eval_partition.txt'\n",
    "ATTR_ITEMS = '../data/input/Anno/list_attr_items.txt'\n",
    "\n",
    "TRAIN_DF = '../data/input/train.csv'\n",
    "VAL_DF = '../data/input/val.csv'\n",
    "TEST_DF = '../data/input/test.csv'\n",
    "\n",
    "CATEGORY = ['Denim',\n",
    "            'Jackets_Vests',\n",
    "            'Pants',\n",
    "            'Shirts_Polos',\n",
    "            'Shorts',\n",
    "            'Suiting',\n",
    "            'Sweaters',\n",
    "            'Sweatshirts_Hoodies',\n",
    "            'Tees_Tanks',\n",
    "            'Blouses_Shirts',\n",
    "            'Cardigans',\n",
    "            'Denim',\n",
    "            'Dresses',\n",
    "            'Graphic_Tees',\n",
    "            'Jackets_Coats',\n",
    "            'Leggings',\n",
    "            'Pants',\n",
    "            'Rompers_Jumpsuits',\n",
    "            'Shorts',\n",
    "            'Skirts',\n",
    "            'Sweaters',\n",
    "            'Sweatshirts_Hoodies',\n",
    "            'Tees_Tanks'\n",
    "           ]\n",
    "\n",
    "num_attr = 463"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:35:06.010129Z",
     "start_time": "2019-03-28T09:34:59.186881Z"
    }
   },
   "outputs": [],
   "source": [
    "landmarks_inshop = pd.read_csv(\n",
    "    LANDMARKS_INSHOP,\n",
    "    sep=r' +',\n",
    "    header=1,\n",
    "    engine='python'\n",
    ")\n",
    "eval_partition = pd.read_csv(\n",
    "    EVAL_PARTITION,\n",
    "    sep=r' +',\n",
    "    header=1,\n",
    "    engine='python'\n",
    ")\n",
    "attr_items = pd.read_csv(\n",
    "    ATTR_ITEMS,\n",
    "    sep=r' ',\n",
    "    header=None,\n",
    "    skiprows=[0,1],\n",
    "    names=['item_id', ]+list(range(num_attr)),\n",
    "    engine='python'\n",
    ")\n",
    "# merge all attribute into one string\n",
    "attr_items['attribute_labels'] = ''\n",
    "for i in range(num_attr):\n",
    "    attr_items['attribute_labels'] += (' ' + attr_items[i].map(str))\n",
    "attr_items['attribute_labels'] = attr_items['attribute_labels'].map(lambda x: x[1:])\n",
    "attr_items = attr_items.drop(columns=range(463))\n",
    "\n",
    "eval_dict = dict([(m, c)for _, m, i, c in eval_partition.to_records()])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 编辑数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:35:06.581299Z",
     "start_time": "2019-03-28T09:35:06.012762Z"
    }
   },
   "outputs": [],
   "source": [
    "def edit_attrimg(index):\n",
    "    \"\"\"transform the No.index item's attribute_labels of attr_items.\n",
    "    Transform the label 1 or -1 into the index of attribute labels\n",
    "    \n",
    "    Arguments\n",
    "        index: the index of attr_items\n",
    "    \n",
    "    Return\n",
    "        None\n",
    "    \"\"\"\n",
    "    labels = attr_items['attribute_labels'].loc[index]\n",
    "    try:\n",
    "        ll = labels\n",
    "        labels = re.split(r' +', labels)\n",
    "    except TypeError as e:\n",
    "        raise TypeError(f'{e}, {ll}')\n",
    "#     labels = [str(item[0]) for item in enumerate(labels) if item[1] == '1']\n",
    "    labels = np.array(labels)\n",
    "    labels = np.argwhere(labels == '1').flatten().astype(np.str)\n",
    "    \n",
    "    labels = ' '.join(labels)\n",
    "    return labels\n",
    "\n",
    "\n",
    "pool = Pool()\n",
    "labels = pool.map(edit_attrimg, range(len(attr_items)))\n",
    "attr_items['attribute_labels'] = labels\n",
    "pool.close()\n",
    "pool.join()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 合并数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:35:08.788390Z",
     "start_time": "2019-03-28T09:35:08.738638Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>image_name</th>\n",
       "      <th>item_id</th>\n",
       "      <th>evaluation_status</th>\n",
       "      <th>landmark_visibility_1</th>\n",
       "      <th>landmark_location_x_1</th>\n",
       "      <th>landmark_location_y_1</th>\n",
       "      <th>landmark_visibility_2</th>\n",
       "      <th>landmark_location_x_2</th>\n",
       "      <th>landmark_location_y_2</th>\n",
       "      <th>landmark_visibility_3</th>\n",
       "      <th>...</th>\n",
       "      <th>landmark_location_x_6</th>\n",
       "      <th>landmark_location_y_6</th>\n",
       "      <th>landmark_visibility_7</th>\n",
       "      <th>landmark_location_x_7</th>\n",
       "      <th>landmark_location_y_7</th>\n",
       "      <th>landmark_visibility_8</th>\n",
       "      <th>landmark_location_x_8</th>\n",
       "      <th>landmark_location_y_8</th>\n",
       "      <th>attribute_labels</th>\n",
       "      <th>category_label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>img/WOMEN/Dresses/id_00000002/02_1_front.jpg</td>\n",
       "      <td>id_00000002</td>\n",
       "      <td>train</td>\n",
       "      <td>1</td>\n",
       "      <td>109</td>\n",
       "      <td>63</td>\n",
       "      <td>0</td>\n",
       "      <td>156</td>\n",
       "      <td>70</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>161.0</td>\n",
       "      <td>136.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>89.0</td>\n",
       "      <td>234.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>206.0</td>\n",
       "      <td>230.0</td>\n",
       "      <td>0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>img/WOMEN/Dresses/id_00000002/02_2_side.jpg</td>\n",
       "      <td>id_00000002</td>\n",
       "      <td>train</td>\n",
       "      <td>1</td>\n",
       "      <td>127</td>\n",
       "      <td>59</td>\n",
       "      <td>0</td>\n",
       "      <td>145</td>\n",
       "      <td>61</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>141.0</td>\n",
       "      <td>125.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>123.0</td>\n",
       "      <td>217.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>132.0</td>\n",
       "      <td>229.0</td>\n",
       "      <td>0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>img/WOMEN/Dresses/id_00000002/02_4_full.jpg</td>\n",
       "      <td>id_00000002</td>\n",
       "      <td>train</td>\n",
       "      <td>0</td>\n",
       "      <td>123</td>\n",
       "      <td>46</td>\n",
       "      <td>0</td>\n",
       "      <td>148</td>\n",
       "      <td>51</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>149.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>101.0</td>\n",
       "      <td>149.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>157.0</td>\n",
       "      <td>156.0</td>\n",
       "      <td>0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>img/WOMEN/Dresses/id_00000002/02_7_additional.jpg</td>\n",
       "      <td>id_00000002</td>\n",
       "      <td>train</td>\n",
       "      <td>0</td>\n",
       "      <td>153</td>\n",
       "      <td>58</td>\n",
       "      <td>0</td>\n",
       "      <td>112</td>\n",
       "      <td>61</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>108.0</td>\n",
       "      <td>141.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>175.0</td>\n",
       "      <td>228.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>91.0</td>\n",
       "      <td>233.0</td>\n",
       "      <td>0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>img/WOMEN/Skirts/id_00000003/02_1_front.jpg</td>\n",
       "      <td>id_00000003</td>\n",
       "      <td>train</td>\n",
       "      <td>1</td>\n",
       "      <td>79</td>\n",
       "      <td>131</td>\n",
       "      <td>1</td>\n",
       "      <td>125</td>\n",
       "      <td>130</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0 1 2 6 12 13 20 23 42 55 84 87 113 152 171 19...</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 29 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          image_name      item_id  \\\n",
       "0       img/WOMEN/Dresses/id_00000002/02_1_front.jpg  id_00000002   \n",
       "1        img/WOMEN/Dresses/id_00000002/02_2_side.jpg  id_00000002   \n",
       "2        img/WOMEN/Dresses/id_00000002/02_4_full.jpg  id_00000002   \n",
       "3  img/WOMEN/Dresses/id_00000002/02_7_additional.jpg  id_00000002   \n",
       "4        img/WOMEN/Skirts/id_00000003/02_1_front.jpg  id_00000003   \n",
       "\n",
       "  evaluation_status  landmark_visibility_1  landmark_location_x_1  \\\n",
       "0             train                      1                    109   \n",
       "1             train                      1                    127   \n",
       "2             train                      0                    123   \n",
       "3             train                      0                    153   \n",
       "4             train                      1                     79   \n",
       "\n",
       "   landmark_location_y_1  landmark_visibility_2  landmark_location_x_2  \\\n",
       "0                     63                      0                    156   \n",
       "1                     59                      0                    145   \n",
       "2                     46                      0                    148   \n",
       "3                     58                      0                    112   \n",
       "4                    131                      1                    125   \n",
       "\n",
       "   landmark_location_y_2  landmark_visibility_3  ...  landmark_location_x_6  \\\n",
       "0                     70                      0  ...                  161.0   \n",
       "1                     61                      1  ...                  141.0   \n",
       "2                     51                      0  ...                  149.0   \n",
       "3                     61                      0  ...                  108.0   \n",
       "4                    130                      0  ...                    0.0   \n",
       "\n",
       "   landmark_location_y_6  landmark_visibility_7  landmark_location_x_7  \\\n",
       "0                  136.0                    0.0                   89.0   \n",
       "1                  125.0                    1.0                  123.0   \n",
       "2                   96.0                    0.0                  101.0   \n",
       "3                  141.0                    0.0                  175.0   \n",
       "4                    0.0                    0.0                    0.0   \n",
       "\n",
       "   landmark_location_y_7  landmark_visibility_8  landmark_location_x_8  \\\n",
       "0                  234.0                    0.0                  206.0   \n",
       "1                  217.0                    0.0                  132.0   \n",
       "2                  149.0                    0.0                  157.0   \n",
       "3                  228.0                    0.0                   91.0   \n",
       "4                    0.0                    0.0                    0.0   \n",
       "\n",
       "   landmark_location_y_8                                   attribute_labels  \\\n",
       "0                  230.0  0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...   \n",
       "1                  229.0  0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...   \n",
       "2                  156.0  0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...   \n",
       "3                  233.0  0 2 5 6 9 11 13 14 15 22 26 52 85 92 143 161 1...   \n",
       "4                    0.0  0 1 2 6 12 13 20 23 42 55 84 87 113 152 171 19...   \n",
       "\n",
       "   category_label  \n",
       "0              12  \n",
       "1              12  \n",
       "2              12  \n",
       "3              12  \n",
       "4              19  \n",
       "\n",
       "[5 rows x 29 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = eval_partition\\\n",
    "    .merge(landmarks_inshop, on='image_name')\\\n",
    "    .merge(attr_items, on='item_id')\\\n",
    "    .drop(columns=['clothes_type', 'variation_type'])\\\n",
    "    .fillna(0)\n",
    "df['category_label'] = df['image_name'].map(lambda x: CATEGORY.index(x.split('/')[-3]))\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分割数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:35:18.420417Z",
     "start_time": "2019-03-28T09:35:18.400558Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "train      26338\n",
       "query      14441\n",
       "gallery    12811\n",
       "Name: evaluation_status, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['evaluation_status'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:35:19.567033Z",
     "start_time": "2019-03-28T09:35:19.459611Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(26338, 14441, 12811)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# split the dataframe by evaluation_status\n",
    "train_df = df[df['evaluation_status'] == 'train']\\\n",
    "            .drop(columns=['evaluation_status'])\\\n",
    "            .reset_index(drop=True)\n",
    "val_df = df[df['evaluation_status'] == 'query']\\\n",
    "            .drop(columns=['evaluation_status'])\\\n",
    "            .reset_index(drop=True)\n",
    "test_df = df[df['evaluation_status'] == 'gallery']\\\n",
    "            .drop(columns=['evaluation_status'])\\\n",
    "            .reset_index(drop=True)\n",
    "\n",
    "len(train_df), len(val_df), len(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-28T09:35:22.218618Z",
     "start_time": "2019-03-28T09:35:20.411009Z"
    }
   },
   "outputs": [],
   "source": [
    "# save the dataframe\n",
    "train_df.to_csv(TRAIN_DF, index=False)\n",
    "val_df.to_csv(VAL_DF, index=False)\n",
    "test_df.to_csv(TEST_DF, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "image-search",
   "language": "python",
   "name": "image-search"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "165px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
